{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. 读取附件1中数据\n",
    "2. 去除无用属性\n",
    "3. 使用箱线图检测异常值并去除\n",
    "4. 去除实际金额为0的数据\n",
    "5. 将附件2中的商品信息添加到附件1中\n",
    "6. 按地点分割数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0. 导入模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单号</th>\n",
       "      <th>设备ID</th>\n",
       "      <th>应付金额</th>\n",
       "      <th>实际金额</th>\n",
       "      <th>商品</th>\n",
       "      <th>支付时间</th>\n",
       "      <th>地点</th>\n",
       "      <th>状态</th>\n",
       "      <th>提现</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DD201708167493663618499909784</td>\n",
       "      <td>E43A6E078A07631</td>\n",
       "      <td>4.5</td>\n",
       "      <td>4.5</td>\n",
       "      <td>68g好丽友巧克力派2枚</td>\n",
       "      <td>2017/1/1 00:53</td>\n",
       "      <td>D</td>\n",
       "      <td>已出货未退款</td>\n",
       "      <td>已提现</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DD201708167493663555814061164</td>\n",
       "      <td>E43A6E078A04172</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>40g双汇玉米热狗肠</td>\n",
       "      <td>2017/1/1 01:33</td>\n",
       "      <td>A</td>\n",
       "      <td>已出货未退款</td>\n",
       "      <td>已提现</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DD201708167493578526890939886</td>\n",
       "      <td>E43A6E078A06874</td>\n",
       "      <td>5.5</td>\n",
       "      <td>5.5</td>\n",
       "      <td>430g泰奇八宝粥</td>\n",
       "      <td>2017/1/1 08:45</td>\n",
       "      <td>E</td>\n",
       "      <td>已出货未退款</td>\n",
       "      <td>已提现</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DD201708167493683507186615837</td>\n",
       "      <td>E43A6E078A04228</td>\n",
       "      <td>5.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>48g好丽友薯愿香烤原味</td>\n",
       "      <td>2017/1/1 09:05</td>\n",
       "      <td>C</td>\n",
       "      <td>已出货未退款</td>\n",
       "      <td>已提现</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DD201708167493759548618252006</td>\n",
       "      <td>E43A6E078A04134</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>600ml可口可乐</td>\n",
       "      <td>2017/1/1 09:41</td>\n",
       "      <td>B</td>\n",
       "      <td>已出货未退款</td>\n",
       "      <td>已提现</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             订单号             设备ID  应付金额  实际金额            商品  \\\n",
       "0  DD201708167493663618499909784  E43A6E078A07631   4.5   4.5  68g好丽友巧克力派2枚   \n",
       "1  DD201708167493663555814061164  E43A6E078A04172   3.0   3.0    40g双汇玉米热狗肠   \n",
       "2  DD201708167493578526890939886  E43A6E078A06874   5.5   5.5     430g泰奇八宝粥   \n",
       "3  DD201708167493683507186615837  E43A6E078A04228   5.0   5.0  48g好丽友薯愿香烤原味   \n",
       "4  DD201708167493759548618252006  E43A6E078A04134   3.0   3.0     600ml可口可乐   \n",
       "\n",
       "             支付时间 地点      状态   提现  \n",
       "0  2017/1/1 00:53  D  已出货未退款  已提现  \n",
       "1  2017/1/1 01:33  A  已出货未退款  已提现  \n",
       "2  2017/1/1 08:45  E  已出货未退款  已提现  \n",
       "3  2017/1/1 09:05  C  已出货未退款  已提现  \n",
       "4  2017/1/1 09:41  B  已出货未退款  已提现  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f=open('../数据/附件1.csv')\n",
    "all_sales_data=pd.read_csv(f)\n",
    "all_sales_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 去除无用属性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>金额</th>\n",
       "      <th>商品</th>\n",
       "      <th>支付时间</th>\n",
       "      <th>地点</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.5</td>\n",
       "      <td>68g好丽友巧克力派2枚</td>\n",
       "      <td>2017/1/1 00:53</td>\n",
       "      <td>D</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>40g双汇玉米热狗肠</td>\n",
       "      <td>2017/1/1 01:33</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5.5</td>\n",
       "      <td>430g泰奇八宝粥</td>\n",
       "      <td>2017/1/1 08:45</td>\n",
       "      <td>E</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.0</td>\n",
       "      <td>48g好丽友薯愿香烤原味</td>\n",
       "      <td>2017/1/1 09:05</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>600ml可口可乐</td>\n",
       "      <td>2017/1/1 09:41</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    金额            商品            支付时间 地点\n",
       "0  4.5  68g好丽友巧克力派2枚  2017/1/1 00:53  D\n",
       "1  3.0    40g双汇玉米热狗肠  2017/1/1 01:33  A\n",
       "2  5.5     430g泰奇八宝粥  2017/1/1 08:45  E\n",
       "3  5.0  48g好丽友薯愿香烤原味  2017/1/1 09:05  C\n",
       "4  3.0     600ml可口可乐  2017/1/1 09:41  B"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#去除属性包括订单号、设备ID、提现、状态、应付金额\n",
    "#将属性名“实际金额”修改为“金额”\n",
    "all_sales_data.drop(columns=[\"订单号\",\"设备ID\",\"提现\",\"状态\",\"应付金额\"],inplace=True)\n",
    "all_sales_data.rename(columns={\"实际金额\":\"金额\"},inplace=True)\n",
    "all_sales_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.使用箱线图检测异常值并去除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "Group=all_sales_data.groupby('商品')\n",
    "#使用箱线图法检查并找出异常值，并提取异常值数据的索引\n",
    "def Outliers(a_list):\n",
    "    outlier_indices = list()\n",
    "    Q1 = np.percentile(a_list, 25)\n",
    "    Q3 = np.percentile(a_list,75)\n",
    "    IQR = Q3 - Q1\n",
    "    outlier_step = 1.5 * IQR\n",
    "    outlier_list_col = a_list[(a_list < Q1 - outlier_step) | (a_list > Q3 + outlier_step )]\n",
    "    return outlier_list_col\n",
    "Out_Index=list()\n",
    "for i in Group.groups:\n",
    "    grp = Group.get_group(i)\n",
    "    outlier = Outliers(grp['金额'])\n",
    "    Out_Index.append(outlier.index)\n",
    "\n",
    "#删除异常数据\n",
    "for i in Out_Index:\n",
    "    all_sales_data.drop(labels = i, axis = 0, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 去除金额为0的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>金额</th>\n",
       "      <th>商品</th>\n",
       "      <th>支付时间</th>\n",
       "      <th>地点</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>408</th>\n",
       "      <td>0.0</td>\n",
       "      <td>85g新麦潮北海道吐司</td>\n",
       "      <td>2017/1/9 08:00</td>\n",
       "      <td>D</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037</th>\n",
       "      <td>0.0</td>\n",
       "      <td>40g双汇玉米热狗肠</td>\n",
       "      <td>2017/1/10 18:24</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1144</th>\n",
       "      <td>0.0</td>\n",
       "      <td>85g新麦潮北海道吐司</td>\n",
       "      <td>2017/1/10 23:36</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495</th>\n",
       "      <td>0.0</td>\n",
       "      <td>85g新麦潮北海道吐司</td>\n",
       "      <td>2017/1/30 15:39</td>\n",
       "      <td>D</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6604</th>\n",
       "      <td>0.0</td>\n",
       "      <td>85g新麦潮北海道吐司</td>\n",
       "      <td>2017/4/28 02:35</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       金额           商品             支付时间 地点\n",
       "408   0.0  85g新麦潮北海道吐司   2017/1/9 08:00  D\n",
       "1037  0.0   40g双汇玉米热狗肠  2017/1/10 18:24  C\n",
       "1144  0.0  85g新麦潮北海道吐司  2017/1/10 23:36  C\n",
       "1495  0.0  85g新麦潮北海道吐司  2017/1/30 15:39  D\n",
       "6604  0.0  85g新麦潮北海道吐司  2017/4/28 02:35  A"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_0=all_sales_data[all_sales_data['金额']==0]  #金额为0的商品数据\n",
    "data_0.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#去除金额为0的商品数据\n",
    "data_index=data_0.index\n",
    "all_sales_data.drop(index=data_index,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. 添加商品信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>商品</th>\n",
       "      <th>大类</th>\n",
       "      <th>二级类</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100g*5瓶益力多</td>\n",
       "      <td>饮料</td>\n",
       "      <td>乳制品</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100g越南LIPO奶味面包干</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>饼干糕点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10g卫龙亲嘴烧香辣味</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>肉干/豆制品/蛋</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10g越南LIPO奶味面包干</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>饼干糕点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110g顺宝九制话梅</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>蜜饯/果干</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                商品   大类       二级类\n",
       "0       100g*5瓶益力多   饮料       乳制品\n",
       "1  100g越南LIPO奶味面包干  非饮料      饼干糕点\n",
       "2      10g卫龙亲嘴烧香辣味  非饮料  肉干/豆制品/蛋\n",
       "3   10g越南LIPO奶味面包干  非饮料      饼干糕点\n",
       "4       110g顺宝九制话梅  非饮料     蜜饯/果干"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f=open('../数据/附件2.csv')\n",
    "info_data=pd.read_csv(f)\n",
    "info_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>金额</th>\n",
       "      <th>商品</th>\n",
       "      <th>支付时间</th>\n",
       "      <th>地点</th>\n",
       "      <th>商品大类</th>\n",
       "      <th>商品二级类</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.5</td>\n",
       "      <td>68g好丽友巧克力派2枚</td>\n",
       "      <td>2017/1/1 00:53</td>\n",
       "      <td>D</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>饼干糕点</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>40g双汇玉米热狗肠</td>\n",
       "      <td>2017/1/1 01:33</td>\n",
       "      <td>A</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>肉干/豆制品/蛋</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.0</td>\n",
       "      <td>48g好丽友薯愿香烤原味</td>\n",
       "      <td>2017/1/1 09:05</td>\n",
       "      <td>C</td>\n",
       "      <td>非饮料</td>\n",
       "      <td>膨化食品</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>600ml可口可乐</td>\n",
       "      <td>2017/1/1 09:41</td>\n",
       "      <td>B</td>\n",
       "      <td>饮料</td>\n",
       "      <td>碳酸饮料</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.5</td>\n",
       "      <td>营养快线</td>\n",
       "      <td>2017/1/1 09:41</td>\n",
       "      <td>B</td>\n",
       "      <td>饮料</td>\n",
       "      <td>乳制品</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    金额            商品            支付时间 地点 商品大类     商品二级类\n",
       "0  4.5  68g好丽友巧克力派2枚  2017/1/1 00:53  D  非饮料      饼干糕点\n",
       "1  3.0    40g双汇玉米热狗肠  2017/1/1 01:33  A  非饮料  肉干/豆制品/蛋\n",
       "3  5.0  48g好丽友薯愿香烤原味  2017/1/1 09:05  C  非饮料      膨化食品\n",
       "4  3.0     600ml可口可乐  2017/1/1 09:41  B   饮料      碳酸饮料\n",
       "5  4.5          营养快线  2017/1/1 09:41  B   饮料       乳制品"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_type=dict(map(lambda x,y:[x,y],list(info_data['商品']),list(info_data['大类'])))\n",
    "second_type=dict(map(lambda x,y:[x,y],list(info_data['商品']),list(info_data['二级类'])))\n",
    "all_sales_data['商品大类']=all_sales_data['商品'].map(first_type)\n",
    "all_sales_data['商品二级类']=all_sales_data['商品'].map(second_type)\n",
    "all_sales_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#初步处理的数据保存为csv文件，以供后续分析使用\n",
    "all_sales_data.to_csv('../数据/初步处理数据.csv', encoding='gbk',index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. 按地点提取数据（任务1.1）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "place=['A','B','C','D','E']\n",
    "for pla in place:\n",
    "    place_data=all_sales_data[all_sales_data['地点']==pla]\n",
    "    place_data.to_csv('../数据/分割数据/task1-1'+pla+'.csv', encoding='gbk',index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
